library(readxl)
library(tidyverse)
library(psych)
library(scales)
# Point the working directory at the data folder; the read_excel() calls
# below use bare file names and are resolved relative to it.
# NOTE(review): this is an absolute, machine-specific path, so the script
# only runs as-is on a machine with this exact folder layout.
setwd("C:/Users/tradingbills/Documents/_exer/_data/math/wk4/")

# 1 compute the covariance
# COV = Sum( (x_i - x_bar) * (y_i - y_bar) ) / (N - 1)
# COVARIANCE
#' Sample covariance of two vectors
#'
#' Computes sum((x_i - mean(x)) * (y_i - mean(y))) / (n - 1), where n is the
#' number of paired observations.
#'
#' @param x A numeric vector.
#' @param y A numeric vector with the same length as `x`.
#' @return A single number: the sample covariance of `x` and `y`, or
#'   `NA_real_` when fewer than two observations are supplied (the n - 1
#'   denominator is not meaningful in that case).
covariance <- function(x, y) {
  # Without this guard R would silently recycle the shorter vector and
  # return a number that is not a covariance of paired observations.
  if (length(x) != length(y)) {
    stop("`x` and `y` must have the same length.", call. = FALSE)
  }
  n <- length(x)
  if (n < 2) {
    return(NA_real_)
  }
  cross_deviations <- (x - mean(x)) * (y - mean(y))
  sum(cross_deviations) / (n - 1)
}

# Cereal data: sugar content (x) against calories (y).
cereal <- read_excel("Cereals.xlsx")
x <- cereal[["Sugar"]]
y <- cereal[["Calories"]]
covariance(x, y)

# Scatter plot of the two variables.
ggplot(cereal, aes(Sugar, Calories)) +
  theme_bw() +
  geom_point()

# Covariance from the hand-written helper.
covar_of_cereal <- covariance(x, y)
# Pearson r = covariance / (sd(x) * sd(y)).
r_of_cereal <- covar_of_cereal / (sd(x) * sd(y))
cor(x, y)
# Hand-computed r should agree with cor() to four decimals (expected TRUE).
round(r_of_cereal, digits = 4) == round(cor(x, y), digits = 4)
# Rank-based (Spearman) correlation for comparison.
cor(x, y, method = "spearman")
# Coefficient of determination; previously recorded value: 0.8561823.
r_sqrd <- cor(x, y)^2
# Simple linear regression of y on x.
fit <- lm(y ~ x)
summary(fit)

#2 
# College football data: total pay against football net revenue.
football <- read_excel("College Football.xlsx")
glimpse(football)

# Keep only the two variables under study and give them syntactic names.
ds02 <- football %>%
  select(`Total Pay`, `Football Net Revenue`) %>%
  rename(
    ttl_pay = `Total Pay`,
    revenue = `Football Net Revenue`
  )
ds02 %>%
  ggplot(aes(revenue, ttl_pay)) +
  theme_bw() +
  geom_point()

x <- ds02$revenue
y <- ds02$ttl_pay

# COVARIANCE, using the covariance() helper defined near the top of this file
# rather than repeating its formula inline.
covar_of_football <- covariance(x, y)
# Pearson r = covariance / (sd(x) * sd(y)).
r_of_football <- covar_of_football / (sd(x) * sd(y))
cor(x, y)
cor(x, y, method = "spearman")
# Hand-computed r should match cor(); compared with a numeric tolerance
# instead of `==` on rounded doubles (expected TRUE).
isTRUE(all.equal(r_of_football, cor(x, y)))
# Coefficient of determination for the football data. This must use the
# local x / y: `cereal` has no columns named `x` or `y`, so cereal$x and
# cereal$y are NULL and cor() stops with an error.
r_sqrd <- cor(x, y)^2

#3 HDL_cholesterol
# HDL cholesterol data: age (x) against cholesterol (y).
hdl <- read_excel("HDL_cholesterol.xlsx")
hdl %>%
  ggplot(aes(x = Age, y = Cholesterol)) +
  theme_bw() +
  geom_point()

x <- hdl$Age
y <- hdl$Cholesterol

# COVARIANCE, using the covariance() helper defined near the top of this file
# rather than repeating its formula inline.
covar_of_hdl <- covariance(x, y)
# Pearson r = covariance / (sd(x) * sd(y)).
r_of_hdl <- covar_of_hdl / (sd(x) * sd(y))
cor(x, y)
cor(x, y, method = "spearman")
# Hand-computed r should match cor(); compared with a numeric tolerance
# instead of `==` on rounded doubles (expected TRUE).
isTRUE(all.equal(r_of_hdl, cor(x, y)))

#4 MRI
# Load the MRI / IQ workbook used by the plot and correlations below.
mri <- read_excel('MRI_IQ.xlsx')
# MRI count is treated as the explanatory variable and IQ as the response.
#' Powers of ten spanning the range of `x`
#'
#' @param x A numeric vector of strictly positive values (log10 of zero or a
#'   negative number is not finite).
#' @return A numeric vector 10^low, ..., 10^high, where `low` is the largest
#'   integer exponent with 10^low <= min(x) and `high` is the smallest with
#'   10^high >= max(x).
breaks_log10 <- function(x) {
  # Both bounds must be taken in base 10 to match the 10^k output; a
  # different base for `low` can put the first break above min(x).
  low <- floor(log10(min(x)))
  high <- ceiling(log10(max(x)))
  10^seq.int(low, high)
}

# Scatter plot of IQ against MRI count, one point shape per gender.
mri %>%
  ggplot(aes(x = MRI_COUNT, y = IQ, shape = GENDER)) +
  geom_point() +
  scale_shape_manual(values = c(24, 16))

# Pearson correlation of MRI count and IQ within each gender.
female <- filter(mri, GENDER == "F")
with(female, cor(MRI_COUNT, IQ))
male <- filter(mri, GENDER == "M")
with(male, cor(MRI_COUNT, IQ))

#5
# lsr provides correlate(), used for the first correlation table below.
library(lsr)
baseball <- read_excel("Baseball P14.xlsx")
correlate(baseball)
# Correlation matrix from cor(), restricted to the numeric columns.
cor(baseball[vapply(baseball, is.numeric, logical(1))])

#8
# Agreement between the two judges: rank-based (Spearman) first, then
# linear (Pearson).
judging <- read_excel("Judging.xlsx")
with(judging, cor(Judge_1_Score, Judge_2_Score, method = "spearman"))
with(judging, cor(Judge_1_Score, Judge_2_Score, method = "pearson"))
